Analysis of Intron + Exon CLIP peaks

Set Analysis mode

CLIPtype:
Introns - Select peaks that are classified as "Protein Coding : Exons"
Exons - Select peaks that are classified as "Protein Coding : Introns"
Introns_LINESINETLR - Select peaks that are classified as Protein Coding : Introns and Intron peaks that are classified as "Repeat Elements" but overelap with LINE SINE or TLR

RunMode:
Genomic - Include Intronic and Exonic Sequences when finding 100nt region Upsteram and Downstream of CLIP peak
Transcriptomic - Only include Exonic Sequences when finding 100nt region Upsteram and Downstream of CLIP peak. If Upstream/Downstream region crosses Intronic region, the intronic sequence is skpped and the next exon in included.

TranscLoc: select subset of Protein Coding CLIP

Overlap with LINE/SINE

Select Any CLIP peaks that overlap with intronic regions. If Peak also Overlaps with a Repeat Element it is removed, except for peaks that overlap with LINE, SINE or LTR.

Get Sequences

sequences are taken window nts Upstream and Downstream of the 5' end of the CLIP peak.

For RunMode='Genomic' sequences are taken from the Genomic sequence irregardless whether it contains Intronic/Exonic regions.
For RunMode='Transcriptomic' sequences are taken from the spliced transcript sequence. If 100nt region crosses into Intronic region. The Intron sequence is not considered and taken from the next Exon.

Gquad identification

G-quadroplex were identified by two methods.
1) fastaRegexFinder
fastaRegexFinder Identifies G-quadroplex based on the supplied Regular Expression for each sequence supplied in the fasta file.

fastaRegexFinder.py -f CLIPsequences.fa -r ([gG]{2,4}w{1,7}){3,}[gG]{2,4} --noreverse > output.txt

  1. QGRS-Mapper
    QGRS-Mapper Identifies and scores G-quads based on GxNy1GxNy2GxNy3Gx.

qgrs -s -csv -i SingleCLIPseqence.fa -o output.txt

GQuad Analysis

##```{r , include=T,fig.asp=1.25, fig.align='center',fig.height=4, fig.width=3}
    

#######################################       
### # of gquad identified
#######################################       

      GdistRando=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Table/Col_GscoreDist_UP.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
    
      for (rand in c(2:12)) {

            GdistRando2=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Table/Col_GscoreDist_UP.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
           GdistRando=as.data.frame(cbind(GdistRando,GdistRando2))
      }

            GdistRandoP=as.data.frame(colSums(is.na(GdistRando)==F))
            colnames(GdistRandoP)=c("Number_Seq_with_GQuads")

            GdistRandoP=GdistRandoP[GdistRandoP$Number_Seq_with_GQuads>0,,drop=F]
            GdistRandoP[,1]=as.numeric(GdistRandoP[,1])
      
       ggplot(GdistRandoP,aes(x=Number_Seq_with_GQuads))+geom_density(size= .6) + theme_classic()+ylab("Density")+xlab("# Locations with identified G quadruplex")+ggtitle('Upstream:# of sequencens with quadruplex' )+
       geom_vline(xintercept=nrow(checkout2[is.na(checkout2$qgrs_maxScore)==F,]),colour="red") +xlim(15,70)
## Warning: Removed 943 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).

####################

      GdistRando_dn=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/Table/Col_GscoreDist_DN.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
     
      for (rand in c(2:12)) {
            GdistRando_dn2=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/Table/Col_GscoreDist_DN.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
       GdistRando_dn=as.data.frame(cbind(GdistRando_dn,GdistRando_dn2));
      }
       
               GdistRando_dnP=as.data.frame(colSums(is.na(GdistRando_dn)==F))
               colnames(GdistRando_dnP)=c("Number_Seq_with_GQuads")
               GdistRando_dnP=GdistRando_dnP[GdistRando_dnP$Number_Seq_with_GQuads>0,,drop=F]

               GdistRando_dnP[,1]=as.numeric(GdistRando_dnP[,1])
      
       ggplot(GdistRando_dnP,aes(x=Number_Seq_with_GQuads))+geom_density(size= .6) + theme_classic()+ylab("Density")+xlab("# Locations with identified G quadruplex")+ggtitle('Downstream:# of sequencens with quadruplex' )+
       geom_vline(xintercept=nrow(checkout2_dn[is.na(checkout2_dn$qgrs_maxScore)==F,]),colour="red") +xlim(15,70)
## Warning: Removed 941 rows containing non-finite values (stat_density).

## Warning: Removed 1 rows containing missing values (geom_vline).

# intersect(checkout2$sequence,checkout2_dn$sequence)
# intersect(checkout2$V7,checkout2_dn$V7)
#        


#######################################       
### Distance of Gquad from CLIP start       
#######################################       
       
        GscrRando=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Table/Col_Gscore_UP.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
                   for (rand in c(2:12)) {

      GdscrRando2=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Table/Col_Gscore_UP.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
      GscrRando=as.data.frame(cbind(GscrRando,GdscrRando2));
                   }
       
      DistnaceDist=as.data.frame(matrix(nrow = nrow(GscrRando),ncol = ncol(GscrRando)))
      GscoreMax=as.data.frame(matrix(nrow = nrow(GscrRando),ncol = ncol(GscrRando)))
      GscoreMaxfrac=as.data.frame(matrix(nrow = nrow(GscrRando),ncol = ncol(GscrRando)))

      for (col in 1:ncol(GscrRando)) {
                  colscr1=((str_split(GscrRando[,col],pattern = ",")))
                  colscr1max=unlist(lapply(colscr1,  function(x) unique(max(as.numeric(x))) ))
                  
                  coldist1=((str_split(GdistRando[,col],pattern = ",")))

                  rowmaxscore=as.data.frame(matrix(nrow=length(colscr1),ncol=1))
                  rowmaxscoredist=as.data.frame(matrix(nrow=length(colscr1),ncol=1))
              for (r in 1:length(colscr1)) {
                        maxscr=grep(colscr1max[r],colscr1[[r]])
                        maxscrdist=as.numeric(coldist1[[r]][maxscr])
                       if (length(maxscr>1)) {
                         maxscr=maxscr[1]
                         maxscrdist=min(as.numeric(coldist1[[r]]))
                         }
                    rowmaxscore[r,]=maxscr
                    rowmaxscoredist[r,]=maxscrdist
              }     
                  DistnaceDist[,col]=rowmaxscoredist[,1]
                   GscoreMax[,col]=colscr1max
                   GscoreMaxfrac[1,col]=sum(colscr1max>21,na.rm = T)/(sum(is.na(colscr1max)==F,na.rm = T))
                   # GscoreMaxfrac[1,col]=sum(colscr1max>21,na.rm = T)

              # xxx=cbind(rowmaxscore,rowdist,GscrRando[,col],GdistRando[,col])
      }
       
      R=melt(DistnaceDist);colnames(R)[2]="qgrsdist_maxScore"
## Using  as id variables
      R$variable=as.character(R$variable)
      
      D=checkout2[,'qgrsdist_maxScore',drop=F]
      D$variable="upstream"
      
ggplot(R,aes(x=qgrsdist_maxScore,line=variable))+
   geom_density(data=R,size= .6,color='grey') +
  geom_density(data=D,size= .6,color='red')+
  theme_classic()+ylab("Density")+xlab("G-Quadruplexes Distance from 5' CLIP peak (nt)")+ggtitle('Upstream G-Quadruplexes Distance')
## Warning: Removed 373042 rows containing non-finite values (stat_density).
## Warning: Removed 574 rows containing non-finite values (stat_density).

       #geom_vline(xintercept=nrow(checkout2[is.na(checkout2$qgrs_maxScore)==F,]),colour="red") +xlim(0,100)

       
       R=melt(GscoreMax);colnames(R)[2]="qgrs_maxScore"
## Using  as id variables
      R$variable=as.character(R$variable)
      
      D=as.data.frame(as.numeric(checkout2[,'qgrs_maxScore',drop=T]))
      colnames(D)[1]="qgrs_maxScore"
      D$variable="downstream"
      
ggplot(R,aes(x=qgrs_maxScore,line=variable))+
   geom_density(data=R,size= .6,color='grey') +
  geom_density(data=D,size= .6,color='red')+
  theme_classic()+ylab("Density")+xlab("G-Quadruplexes Score (qgrs)")+ggtitle('Upstream G-Quadruplexes score')
## Warning: Removed 373042 rows containing non-finite values (stat_density).

## Warning: Removed 574 rows containing non-finite values (stat_density).

       #geom_vline(xintercept=nrow(checkout2[is.na(checkout2$qgrs_maxScore)==F,]),colour="red") +xlim(0,100)
 

########

 R=as.data.frame(t(GscoreMaxfrac[1,]));
 colnames(R)="qgrs_maxScoreFrac"
      # R$variable=as.character(R$variable)
      
   
ggplot(R,aes(x=qgrs_maxScoreFrac))+geom_density(size= .6) + theme_classic()+ylab("Density")+xlab("Fraction of G-Quadruplexs with scores >21")+ggtitle('Upstream: Fraction of G-Quadruplexs with High scores' )+
       geom_vline(xintercept=sum(as.numeric(checkout2$qgrs_maxScore)>21,na.rm = T)/sum(is.na(checkout2$qgrs_maxScore)==F,na.rm = T),colour="red") 
## Warning: Removed 257 rows containing non-finite values (stat_density).

 # ggplot(R,aes(x=qgrs_maxScoreFrac))+geom_density(size= .6) + theme_classic()+ylab("Density")+xlab("Upstream: Fraction of G-Quadruplexs with scores >21")+ggtitle('# of G-Quadruplexs with High scores' )+
 #       geom_vline(xintercept=sum(as.numeric(checkout2$qgrs_maxScore)>21,na.rm = T),colour="red")
   

      
#########

GscrRando_dn=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/Table/Col_Gscore_DN.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
            for (rand in c(2:12)) {

      GdscrRando_dn2=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/Table/Col_Gscore_DN.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
      GscrRando_dn=as.data.frame(cbind(GscrRando_dn,GdscrRando_dn2));
            }
      
      DistnaceDist_dn=as.data.frame(matrix(nrow = nrow(GscrRando_dn),ncol = ncol(GscrRando_dn)))
      GscoreMax_dn=as.data.frame(matrix(nrow = nrow(GscrRando_dn),ncol = ncol(GscrRando_dn)))
      GscoreMaxfrac_dn=as.data.frame(matrix(nrow = nrow(GscrRando_dn),ncol = ncol(GscrRando_dn)))
      for (col in 1:ncol(GscrRando_dn)) {
                  colscr1=((str_split(GscrRando_dn[,col],pattern = ",")))
                  colscr1max=unlist(lapply(colscr1,  function(x) unique(max(as.numeric(x))) ))
                  
                  coldist1=((str_split(GdistRando_dn[,col],pattern = ",")))

                  rowmaxscore=as.data.frame(matrix(nrow=length(colscr1),ncol=1))
                  rowmaxscoredist=as.data.frame(matrix(nrow=length(colscr1),ncol=1))
              for (r in 1:length(colscr1)) {
                        maxscr=grep(colscr1max[r],colscr1[[r]])
                        maxscrdist=as.numeric(coldist1[[r]][maxscr])
                       if (length(maxscr>1)) {
                         maxscr=maxscr[1]
                         maxscrdist=min(as.numeric(coldist1[[r]]))
                         }
                    rowmaxscore[r,]=maxscr
                    rowmaxscoredist[r,]=maxscrdist
              }     
                  DistnaceDist_dn[,col]=rowmaxscoredist[,1]
                  GscoreMax_dn[,col]=colscr1max
                  GscoreMaxfrac_dn[,col]=sum(colscr1max>21,na.rm = T)/sum(is.na(colscr1max)==F,na.rm = T)

              # xxx=cbind(rowmaxscore,rowdist,GscrRando_dn[,col],GdistRando_dn[,col])
      }
       
      R=melt(DistnaceDist_dn);colnames(R)[2]="qgrsdist_maxScore"
## Using  as id variables
      R$variable=as.character(R$variable)
      
      D=checkout2_dn[,'qgrsdist_maxScore',drop=F]
      D$variable="downstream"
      
ggplot(R,aes(x=qgrsdist_maxScore,line=variable))+
   geom_density(data=R,size= .6,color='grey') +
  geom_density(data=D,size= .6,color='red')+
  theme_classic()+ylab("Density")+xlab("G-Quadruplexes Distance from 5' CLIP peak (nt)")+ggtitle('Downstream G-Quadruplexes Distance')
## Warning: Removed 335394 rows containing non-finite values (stat_density).
## Warning: Removed 605 rows containing non-finite values (stat_density).

       #geom_vline(xintercept=nrow(checkout2_dn[is.na(checkout2_dn$qgrs_maxScore)==F,]),colour="red") #+xlim(0,100)



 R=melt(GscoreMax_dn);colnames(R)[2]="qgrs_maxScore"
## Using  as id variables
      R$variable=as.character(R$variable)
      
D=as.data.frame(as.numeric(checkout2_dn[,'qgrs_maxScore',drop=T]))
      colnames(D)[1]="qgrs_maxScore"
      D$variable="downstream"
      
ggplot(R,aes(x=qgrs_maxScore,line=variable))+
   geom_density(data=R,size= .6,color='grey') +
  geom_density(data=D,size= .6,color='red')+
  theme_classic()+ylab("Density")+xlab("G-Quadruplexes Score (qgrs)")+ggtitle('Downstream G-Quadruplexes score')
## Warning: Removed 335394 rows containing non-finite values (stat_density).

## Warning: Removed 605 rows containing non-finite values (stat_density).

       #geom_vline(xintercept=nrow(checkout2_dn[is.na(checkout2_dn$qgrs_maxScore)==F,]),colour="red") #+xlim(0,100)



########

 R=as.data.frame(t(GscoreMaxfrac_dn[1,]));
 colnames(R)="qgrs_maxScoreFrac"
      # R$variable=as.character(R$variable)
      
   
ggplot(R,aes(x=qgrs_maxScoreFrac))+geom_density(size= .6) + theme_classic()+ylab("Density")+xlab("Fraction of G-Quadruplexs with scores >21")+ggtitle('Downstream: Fraction of G-Quadruplexs with High scores' )+
       geom_vline(xintercept=sum(as.numeric(checkout2_dn$qgrs_maxScore)>21,na.rm = T)/sum(is.na(checkout2_dn$qgrs_maxScore)==F,na.rm = T),colour="red") 
## Warning: Removed 259 rows containing non-finite values (stat_density).

Nucleotide Content

Individual nucleotide content for the Upstream and Dwonstream sequences of the supplied CLIP location.

Upstream

# ```{r , include=T,fig.asp=1.25, fig.align='center',fig.height=4, fig.width=3,eval=T}

  # upstream   
    
    for (ch in c(10,20,30,40,50)) { 
    # for (ch in c(20)) { 
    chadj=ch-1

    chucnkseqA=as.data.frame(matrix(nrow=nrow(checkout2),ncol=(nchar(checkout2[1,'sequence'])-chadj)))
    rownames(chucnkseqA)=checkout2$ID;colnames(chucnkseqA)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)
    chucnkseqU=as.data.frame(matrix(nrow=nrow(checkout2),ncol=(nchar(checkout2[1,'sequence'])-chadj)))
    rownames(chucnkseqU)=checkout2$ID;colnames(chucnkseqU)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)
    chucnkseqG=as.data.frame(matrix(nrow=nrow(checkout2),ncol=(nchar(checkout2[1,'sequence'])-chadj)))
    rownames(chucnkseqG)=checkout2$ID;colnames(chucnkseqG)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)
    chucnkseqC=as.data.frame(matrix(nrow=nrow(checkout2),ncol=(nchar(checkout2[1,'sequence'])-chadj)))
    rownames(chucnkseqC)=checkout2$ID;colnames(chucnkseqC)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)
    
    for (l in 1:(nchar(checkout2[1,'sequence'])-chadj)) {
      ss= substr(checkout2[,'sequence'], start = l, stop = (l+chadj))
      chucnkseqA[,l]=str_count(ss, "A") /ch
      chucnkseqU[,l]=str_count(ss, "U") /ch
      chucnkseqG[,l]=str_count(ss, "G") /ch
      chucnkseqC[,l]=str_count(ss, "C") /ch
    }
    
        chucnkseqG=chucnkseqG[order(rowMaxs(as.matrix(chucnkseqG)),decreasing = T),]

      # NucHeatPlot(chucnkseqA,checkout2,chucnkseqG,"A",'upstream')
      # NucHeatPlot(chucnkseqU,checkout2,chucnkseqG,"U",'upstream')
      # NucHeatPlot(chucnkseqG,checkout2,chucnkseqG,"G",'upstream')
      # NucHeatPlot(chucnkseqC,checkout2,chucnkseqG,"C",'upstream')
        
      

      cht=cbind((colMeans(chucnkseqA)),(colMeans(chucnkseqU)),(colMeans(chucnkseqC)),(colMeans(chucnkseqG)))
colnames(cht)=c('A','U','C','G')
cht=melt(cht)
colnames(cht)=c('distance','Nucleotide','Percent')

####### plot CLIP peak data
# print(ggplot(cht,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
#                  xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content ')+theme_classic()+ggtitle(paste0('window size=',ch,' - upstream 100nt'))+
#       geom_line(data=cht,na.rm=F)+  ylim(.1,.4))

########### Import Random data
    
      GRandom_A=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/chucnkseqA_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                        colnames(GRandom_A)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)

      GRandom_U=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/chucnkseqU_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                              colnames(GRandom_U)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)  
                              
      GRandom_G=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/chucnkseqG_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                              colnames(GRandom_G)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1) 
      
       GRandom_C=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/chucnkseqC_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                              colnames(GRandom_C)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)                                                
                              
for (rand in c(2:12)) {
  GRandom2_Ax=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/chucnkseqA_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                        colnames(GRandom2_Ax)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)
                         GRandom_A=rbind(GRandom_A,GRandom2_Ax)
      
  GRandom2_Ux=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/chucnkseqU_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                        colnames(GRandom2_Ux)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)
                         GRandom_U=rbind(GRandom_U,GRandom2_Ux)
                         
  GRandom2_Gx=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/chucnkseqG_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                        colnames(GRandom2_Gx)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)
                         GRandom_G=rbind(GRandom_G,GRandom2_Gx)
                         
  GRandom2_Cx=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/chucnkseqC_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                        colnames(GRandom2_Cx)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)
                         GRandom_C=rbind(GRandom_C,GRandom2_Cx)
  
}
      
                colnames(GRandom_A)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)
            GRandom_A_avg=colMeans(GRandom_A)    
                GRandom_A= melt(t(GRandom_A))
                GRandom_A$X2=as.character(GRandom_A$X2)
                colnames(GRandom_A)=c('distance','Nucleotide','Percent')

                colnames(GRandom_U)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)
            GRandom_U_avg=colMeans(GRandom_U)    
                GRandom_U= melt(t(GRandom_U))
                GRandom_U$X2=as.character(GRandom_U$X2)
                colnames(GRandom_U)=c('distance','Nucleotide','Percent')
      
                colnames(GRandom_G)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)
            GRandom_G_avg=colMeans(GRandom_G)    
                GRandom_G= melt(t(GRandom_G))
                GRandom_G$X2=as.character(GRandom_G$X2)
                colnames(GRandom_G)=c('distance','Nucleotide','Percent')
                
                colnames(GRandom_C)=seq(-(nchar(checkout2[1,'sequence'])-chadj),-1)
            GRandom_C_avg=colMeans(GRandom_C)    
                GRandom_C= melt(t(GRandom_C))
                GRandom_C$X2=as.character(GRandom_C$X2) 
                colnames(GRandom_C)=c('distance','Nucleotide','Percent')

                
 Rht=cbind(((GRandom_A_avg)),((GRandom_U_avg)),((GRandom_C_avg)),((GRandom_G_avg)))
colnames(Rht)=c('A','U','C','G')
Rht=melt(Rht)
colnames(Rht)=c('distance','Nucleotide','Percent')

####### plot Random Averages
# print(ggplot(Rht,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
#                  xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content')+theme_classic()+ggtitle(paste0('Random Sequenc:window size=',ch,' - upstream 100nt'))+
#       geom_line(na.rm=F))


####### plot all nuc with all random
# print(ggplot(cht,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
#                  xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content')+theme_classic()+ggtitle(paste0('window size=',ch,' - upstream 100nt'))+
#       geom_line(data = GRandom_A,color='red',alpha=0.05)+
#       geom_line(data = GRandom_U,color='purple',alpha=0.05)+
#       geom_line(data = GRandom_G,color='blue',alpha=0.05)+
#       geom_line(data = GRandom_C,color='green',alpha=0.05)+
#       geom_line(data=cht,na.rm=F))


###
print(ggplot(cht,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
                 xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content ')+theme_classic()+ggtitle(paste0('window size=',ch,' - upstream 100nt'))+
      geom_line(data=cht,na.rm=F)+
        geom_line(data=Rht,na.rm=F,alpha=0.25,size=2)+  ylim(.1,.4))

################# PLOT INDIVIDUAL Nuc with Random backgound
chtx=cht[cht$Nucleotide%in%"A",]
print(ggplot(chtx,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
                 xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content (A)')+theme_classic()+ggtitle(paste0('window size=',ch,' - upstream 100nt'))+
      geom_line(data = GRandom_A,color='grey',alpha=0.05)+
      geom_line(data=chtx,na.rm=F,color='red')+  ylim(.1,.4))

chtx=cht[cht$Nucleotide%in%"U",]
print(ggplot(chtx,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
                 xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content (U)')+theme_classic()+ggtitle(paste0('window size=',ch,' - upstream 100nt'))+
      geom_line(data = GRandom_U,color='grey',alpha=0.05)+
      geom_line(data=chtx,na.rm=F,color='purple')+  ylim(.1,.4))

chtx=cht[cht$Nucleotide%in%"G",]
print(ggplot(chtx,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
                 xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content (G)')+theme_classic()+ggtitle(paste0('window size=',ch,' - upstream 100nt'))+
      geom_line(data = GRandom_G,color='grey',alpha=0.05)+
      geom_line(data=chtx,na.rm=F,color='blue')+  ylim(.1,.4))

chtx=cht[cht$Nucleotide%in%"C",]
print(ggplot(chtx,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
                 xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content (C)')+theme_classic()+ggtitle(paste0('window size=',ch,' - upstream 100nt'))+
      geom_line(data = GRandom_C,color='grey',alpha=0.05)+
      geom_line(data=chtx,na.rm=F,color='green')+  ylim(.1,.4))
      
      
      write.table(chucnkseqA, file=paste0(outdir_UP,'/chucnkseqA_window',ch,'.txt'), quote=F, sep="\t", row.names=F, col.names=T)
      write.table(chucnkseqU, file=paste0(outdir_UP,'/chucnkseqU_window',ch,'.txt'), quote=F, sep="\t", row.names=F, col.names=T)
      write.table(chucnkseqG, file=paste0(outdir_UP,'/chucnkseqG_window',ch,'.txt'), quote=F, sep="\t", row.names=F, col.names=T)
      write.table(chucnkseqC, file=paste0(outdir_UP,'/chucnkseqC_window',ch,'.txt'), quote=F, sep="\t", row.names=F, col.names=T)
    
      remove(chucnkseqA,chucnkseqU,chucnkseqG,chucnkseqC)  
      
      
    }

Downstream

##```{r , include=T,fig.asp=1.25, fig.align='center',fig.height=4, fig.width=3}
    
    # downstream   

    for (ch in c(10,20,30,40,50)) {
      chadj=ch-1
    
    chucnkseq_dnA=as.data.frame(matrix(nrow=nrow(checkout2_dn),ncol=(nchar(checkout2_dn[1,'sequence'])-chadj)))
    rownames(chucnkseq_dnA)=checkout2_dn$ID;colnames(chucnkseq_dnA)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))
    chucnkseq_dnU=as.data.frame(matrix(nrow=nrow(checkout2_dn),ncol=(nchar(checkout2_dn[1,'sequence'])-chadj)))
    rownames(chucnkseq_dnU)=checkout2_dn$ID;colnames(chucnkseq_dnU)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))
    chucnkseq_dnG=as.data.frame(matrix(nrow=nrow(checkout2_dn),ncol=(nchar(checkout2_dn[1,'sequence'])-chadj)))
    rownames(chucnkseq_dnG)=checkout2_dn$ID;colnames(chucnkseq_dnG)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))
    chucnkseq_dnC=as.data.frame(matrix(nrow=nrow(checkout2_dn),ncol=(nchar(checkout2_dn[1,'sequence'])-chadj)))
    rownames(chucnkseq_dnC)=checkout2_dn$ID;colnames(chucnkseq_dnC)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))
    
    for (l in 1:(nchar(checkout2_dn[1,'sequence'])-chadj)) {
      ss= substr(checkout2_dn[,'sequence'], start = l, stop = (l+chadj))
      chucnkseq_dnA[,l]=str_count(ss, "A") /ch
      chucnkseq_dnU[,l]=str_count(ss, "U") /ch
      chucnkseq_dnG[,l]=str_count(ss, "G") /ch
      chucnkseq_dnC[,l]=str_count(ss, "C") /ch
    }

   chucnkseq_dnG=chucnkseq_dnG[order(rowMaxs(as.matrix(chucnkseq_dnG)),decreasing = T),]
        

      # NucHeatPlot(chucnkseq_dnA,checkout2_dn,chucnkseq_dnG,"A",'downstream')
      # NucHeatPlot(chucnkseq_dnU,checkout2_dn,chucnkseq_dnG,"U",'downstream')
      # NucHeatPlot(chucnkseq_dnG,checkout2_dn,chucnkseq_dnG,"G",'downstream')
      # NucHeatPlot(chucnkseq_dnC,checkout2_dn,chucnkseq_dnG,"C",'downstream')

      cht=cbind((colMeans(chucnkseq_dnA)),(colMeans(chucnkseq_dnU)),(colMeans(chucnkseq_dnC)),(colMeans(chucnkseq_dnG)))
colnames(cht)=c('A','U','C','G')
cht=melt(cht)
colnames(cht)=c('distance','Nucleotide','Percent')

####### plot CLIP peak data
# print(ggplot(cht,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
#                  xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content ')+theme_classic()+ggtitle(paste0('window size=',ch,' - downstream 100nt'))+
#       geom_line(data=cht,na.rm=F)+  ylim(.1,.4))

########### Import Random data
    
      GRandom_A=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/chucnkseqA_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                        colnames(GRandom_A)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))

      GRandom_U=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/chucnkseqU_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                              colnames(GRandom_U)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))  
                              
      GRandom_G=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/chucnkseqG_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                              colnames(GRandom_G)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj)) 
      
       GRandom_C=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/chucnkseqC_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                              colnames(GRandom_C)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))                                                
                              
for (rand in c(2:12)) {
  GRandom2_Ax=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/chucnkseqA_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                        colnames(GRandom2_Ax)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))
                         GRandom_A=rbind(GRandom_A,GRandom2_Ax)
      
  GRandom2_Ux=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/chucnkseqU_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                        colnames(GRandom2_Ux)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))
                         GRandom_U=rbind(GRandom_U,GRandom2_Ux)
                         
  GRandom2_Gx=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/chucnkseqG_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                        colnames(GRandom2_Gx)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))
                         GRandom_G=rbind(GRandom_G,GRandom2_Gx)
                         
  GRandom2_Cx=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/chucnkseqC_window',ch,'.txt'),comment.char = "%",header = T,sep = "\t",stringsAsFactors = F))
                        colnames(GRandom2_Cx)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))
                         GRandom_C=rbind(GRandom_C,GRandom2_Cx)
  
}
      
                colnames(GRandom_A)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))
            GRandom_A_avg=colMeans(GRandom_A)    
                GRandom_A= melt(t(GRandom_A))
                GRandom_A$X2=as.character(GRandom_A$X2)
                colnames(GRandom_A)=c('distance','Nucleotide','Percent')

                colnames(GRandom_U)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))
            GRandom_U_avg=colMeans(GRandom_U)    
                GRandom_U= melt(t(GRandom_U))
                GRandom_U$X2=as.character(GRandom_U$X2)
                colnames(GRandom_U)=c('distance','Nucleotide','Percent')
      
                colnames(GRandom_G)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))
            GRandom_G_avg=colMeans(GRandom_G)    
                GRandom_G= melt(t(GRandom_G))
                GRandom_G$X2=as.character(GRandom_G$X2)
                colnames(GRandom_G)=c('distance','Nucleotide','Percent')
                
                colnames(GRandom_C)=seq(1,(nchar(checkout2_dn[1,'sequence'])-chadj))
            GRandom_C_avg=colMeans(GRandom_C)    
                GRandom_C= melt(t(GRandom_C))
                GRandom_C$X2=as.character(GRandom_C$X2) 
                colnames(GRandom_C)=c('distance','Nucleotide','Percent')

                                
 Rht=cbind(((GRandom_A_avg)),((GRandom_U_avg)),((GRandom_C_avg)),((GRandom_G_avg)))
colnames(Rht)=c('A','U','C','G')
Rht=melt(Rht)
colnames(Rht)=c('distance','Nucleotide','Percent')

####### plot Random Averages
# print(ggplot(Rht,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
#                  xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content')+theme_classic()+ggtitle(paste0('Random Sequenc:window size=',ch,' - downstream 100nt'))+
#       geom_line(na.rm=F))


####### plot all nuc with all random
# print(ggplot(cht,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
#                  xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content')+theme_classic()+ggtitle(paste0('window size=',ch,' - downstream 100nt'))+
#       geom_line(data = GRandom_A,color='red',alpha=0.05)+
#       geom_line(data = GRandom_U,color='purple',alpha=0.05)+
#       geom_line(data = GRandom_G,color='blue',alpha=0.05)+
#       geom_line(data = GRandom_C,color='green',alpha=0.05)+
#       geom_line(data=cht,na.rm=F))


### plot CLIP avg with Random AVG
print(ggplot(cht,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
                 xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content ')+theme_classic()+ggtitle(paste0('window size=',ch,' - downstream 100nt'))+
      geom_line(data=cht,na.rm=F)+
        geom_line(data=Rht,na.rm=F,alpha=0.25,size=2)+  ylim(.1,.4))

################# PLOT INDIVIDUAL Nuc with Random backgound
chtx=cht[cht$Nucleotide%in%"A",]
print(ggplot(chtx,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
                 xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content (A)')+theme_classic()+ggtitle(paste0('window size=',ch,' - downstream 100nt'))+
      geom_line(data = GRandom_A,color='grey',alpha=0.05)+
      geom_line(data=chtx,na.rm=F,color='red')+  ylim(.1,.4))

chtx=cht[cht$Nucleotide%in%"U",]
print(ggplot(chtx,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
                 xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content (U)')+theme_classic()+ggtitle(paste0('window size=',ch,' - downstream 100nt'))+
      geom_line(data = GRandom_U,color='grey',alpha=0.05)+
      geom_line(data=chtx,na.rm=F,color='purple')+  ylim(.1,.4))

chtx=cht[cht$Nucleotide%in%"G",]
print(ggplot(chtx,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
                 xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content (G)')+theme_classic()+ggtitle(paste0('window size=',ch,' - downstream 100nt'))+
      geom_line(data = GRandom_G,color='grey',alpha=0.05)+
      geom_line(data=chtx,na.rm=F,color='blue')+  ylim(.1,.4))

chtx=cht[cht$Nucleotide%in%"C",]
print(ggplot(chtx,aes(y=Percent,x=distance,line=Nucleotide,color=Nucleotide))+
                 xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content (C)')+theme_classic()+ggtitle(paste0('window size=',ch,' - downstream 100nt'))+
      geom_line(data = GRandom_C,color='grey',alpha=0.05)+
      geom_line(data=chtx,na.rm=F,color='green')+  ylim(.1,.4))
      
    
    write.table(chucnkseq_dnA, file=paste0(outdir_DN,'/chucnkseq_dnA_window',ch,'.txt'), quote=F, sep="\t", row.names=T, col.names=T)
    write.table(chucnkseq_dnU, file=paste0(outdir_DN,'/chucnkseq_dnU_window',ch,'.txt'), quote=F, sep="\t", row.names=T, col.names=T)
    write.table(chucnkseq_dnG, file=paste0(outdir_DN,'/chucnkseq_dnG_window',ch,'.txt'), quote=F, sep="\t", row.names=T, col.names=T)
    write.table(chucnkseq_dnC, file=paste0(outdir_DN,'/chucnkseq_dnC_window',ch,'.txt'), quote=F, sep="\t", row.names=T, col.names=T)
    
    
    
    
    
    }

Nucleotide Content Heatmap

Heat map is clustered by %G content (first plot)
All heat maps are in same row order
Line plots show average nucleotide content across upstream/downstream region for each cluster
First heat plot is annotated with G-quadruplex score

                draw(ht_list, ht_gap = unit(.5, "in"),main_heatmap = "matG",column_title = "Distance from CLIP peak",column_title_side="bottom")

for (clst in 1:length(op)) {
  
  opn2=rownames(chucnkseqG_UPDN)[unlist(op[[clst]])]
     
     opavg=cbind(colMeans(chucnkseqA_UPDN[opn2,]),colMeans(chucnkseqU_UPDN[opn2,]),colMeans(chucnkseqC_UPDN[opn2,]),colMeans(chucnkseqG_UPDN[opn2,]))
     colnames(opavg)=c('A','U','C','G')
     opavg=melt(opavg)
     colnames(opavg)=c("distance","nucleotide","value")
     
     print(ggplot(opavg,aes(y=value,x=distance,line=nucleotide,color=nucleotide))+
                 xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Nucleotide content ')+theme_classic()+ggtitle(paste0('Cluster ',clst,': window size=',ch,' - upstream/downstream 100nt'))+
      geom_line(na.rm=F)+ylim(.0,.6) 
      )
   }

      ## Distance of Splicejunction from CLIP start
#######################################       
#### Distance of Splicejunction from CLIP start       
#######################################       
if (CLIPtype=='Introns'){name='Intron'}
   if (CLIPtype=='Exons'){name='Exon'}

       
      GexnRando=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Table/Col_exon_UP.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
            for (rand in c(2:12)) {

      GexnRando2=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Table/Col_exon_UP.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
      GexnRando=as.data.frame(cbind(GexnRando,GexnRando2),stringsAsFactors = F);
            }
      GexnRando=GexnRando[is.na(GexnRando$V1)==F,]

      
######################
      GtnsxRando=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Table/Col_transcript_UP.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
      
            for (rand in c(2:12)) {
      GtnsxRando2=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Table/Col_transcript_UP.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
      GtnsxRando=as.data.frame(cbind(GtnsxRando,GtnsxRando2),stringsAsFactors = F);
            }
      GtnsxRando=GtnsxRando[is.na(GtnsxRando$V1)==F,]

######################
      GidRando=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Table/Col_ID_UP.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
                  
      for (rand in c(2:12)) {
      GidRando2=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Table/Col_ID_UP.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
      GidRando=as.data.frame(cbind(GidRando,GidRando2),stringsAsFactors = F);
                  }
      GidRando=GidRando[is.na(GidRando$V1)==F,]
######################
      
      
        exons=mm10_USE[mm10_USE$transcript_id%in%canonical$transcript,]
      
exons.GR=GRanges(seqnames=exons$seqname, ranges=IRanges(start=exons$start,end=exons$end),
strand=exons$strand,transcript_id=exons$transcript_id,exon_number=exons$exon_number)

ExonDist=as.data.frame(matrix(nrow=nrow(GidRando),ncol=ncol(GidRando)))
ExonDist_dn=as.data.frame(matrix(nrow=nrow(GidRando),ncol=ncol(GidRando)))
ExonTransc=as.data.frame(matrix(nrow=nrow(GidRando),ncol=ncol(GidRando)))
      # for (col in 1:ncol(GidRando)) {
      for (col in 1:1) {
                  id=GidRando[,col]
                  trnx=GtnsxRando[,col]
                  exn=GexnRando[,col]
                  RandInfo=as.data.frame(cbind(id,trnx,exn),stringsAsFactors = F)
                      RandInfo$exn=as.numeric(RandInfo$exn)
                  RandInfo=separate(RandInfo,id,into = c('start','strand'),sep="_",remove = F)    
                  RandInfo=separate(RandInfo,start,into = c('chr','start'),sep=":",remove = F)    
                      RandInfo$start=as.numeric(RandInfo$start)
              for (r in 1:length(id)) {
                    RandInfoR=RandInfo[r,]

                 RandInfoTrans= mm10_USE[mm10_USE$transcript_id%in%RandInfo[r,'trnx']&mm10_USE$exon_number%in%as.numeric(RandInfo[r,'exn']),]
                 
                if (nrow(RandInfoTrans)>1) {toomannyrows}
        
    
                  
       if (unique(RandInfoR$strand)=="+") {ExonDist[r,col]=RandInfoR$start-RandInfoTrans$start
                                    ExonDist_dn[r,col]=(RandInfoTrans$end-RandInfoR$start)}
                  
       if (unique(RandInfoR$strand)=="-") {ExonDist[r,col]=(RandInfoTrans$end-RandInfoR$start)
                                    ExonDist_dn[r,col]=(RandInfoR$start-RandInfoTrans$start)}
        
                  ExonTransc[r,col]=RandInfoR$trnx
        # peaks_Anno_mpile=cbind(qh,sh)
        # peaks_Anno_mpile=merge(peaks2,peaks_Anno_mpile,by.x="ID",by.y="ID_peaks",all.x=T)

              }
                      print(col)
      }

    # ExonDistx=ExonDist  
# ExonDist=ExonDistx
ExonDist=ExonDist[is.na(ExonDist[1,])==F,]

      R=melt(ExonDist);colnames(R)[2]="ExonDist"
      R$variable=as.character(R$variable)
      
     
       
      checkout2$upstream_exon_dist=NA
      checkout2_dn$downstream_exon_dist=NA
      for (r in 1:nrow(checkout2)) {

         D=checkout2[r,]
         M=mm10_USE[(mm10_USE$transcript_id%in%D$transcript_id)&(mm10_USE$exon_number%in%D$`exon#`),]
        colnames(M)=paste0(colnames(M),"_mm10")
       
        if (unique(D$strand)=="+") {checkout2[r,'upstream_exon_dist']=unique(((D$crosslink-M$start_mm10)))
                             checkout2_dn[r,'downstream_exon_dist']=unique(max((M$end_mm10-D$crosslink)))}
                  
       if (unique(D$strand)=="-") {checkout2[r,'upstream_exon_dist']=unique(((M$end_mm10-D$crosslink)))
                                    checkout2_dn[r,'downstream_exon_dist']=unique(((D$crosslink-M$start_mm10)))}      
      }
      
 D=checkout2 
 D=(D[,'upstream_exon_dist',drop=F])
 colnames(D)='ExonDist'
 D$variable='CLIP_Exon'
ggplot(R,aes(x=ExonDist,line=variable))+
   geom_density(data=R,size= .6,color='grey') +
  geom_density(data=D,size= .6,color='red')+
  theme_classic()+ylab("Density")+xlab("Upstream Exon junction Distance from 5' CLIP peak (nt)")+ggtitle("Distance of 5' CLIP peak (nt) from Upstream Exon Junction")+
  scale_x_continuous(trans = 'log10')
       #geom_vline(xintercept=nrow(checkout2[is.na(checkout2$qgrs_maxScore)==F,]),colour="red") +xlim(0,100)


D=checkout2_dn 
 D=(D[,'downstream_exon_dist',drop=F])
 colnames(D)='ExonDist'
 D$variable='CLIP_Exon'
ggplot(R,aes(x=ExonDist,line=variable))+
   geom_density(data=R,size= .6,color='grey') +
  geom_density(data=D,size= .6,color='red')+
  theme_classic()+ylab("Density")+xlab("Downstream Exon junction Distance from 5' CLIP peak (nt)")+ggtitle("Distance of 5' CLIP peak (nt) from Downstream Exon Junction")+
  scale_x_continuous(trans = 'log10')
       #geom_vline(xintercept=nrow(checkout2[is.na(checkout2$qgrs_maxScore)==F,]),colour="red") +xlim(0,100)

CLIP peak Local Structure - Basepairing Probability

All possible Base Pairing probabilities were calculated for the 100nt upstream/downstream region of Intronic CLIP peaks.

"RNAfold -p -i CLIPsequences.fa > out.fold"

The base pairing probabilities were then averaged for all possible pairings in a 10nt sliding window. Base paring probabilities are taken from RNAfold_out.dp.ps

Gibbs free energy for each 100 nt Sequence was also calculated and taken from out.fold

      ####################################################################
      #### Upstream Base Pairing Probability
      ####################################################################

    system(paste0('rm -f ',scratch_UP,'/*.ps'),intern = F)
    system(paste0('rm -f ',scratch_UP,'/out.fold'),intern = F)
    
      # system(paste0('RNAfold -p -i ',outdir_UP,'/AllLocation_Random.fa > out.fold'),intern = F)
      system(paste0(RNAfold,' -p -i ',outdir_UP,'/AllLocation_Random.fa > out.fold'),intern = F)
      system(paste0('mv *.ps ',scratch_UP),intern = F)
      system(paste0('mv out.fold ',scratch_UP),intern = F)
      
      
      #### get deltaG
      enrg=read.delim(paste0(scratch_UP,'/out.fold'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F)
    
      for (e in 1:nrow(checkout2)) {
        s=as.character(checkout2[e,'n'])
        s=gsub("\\(-)","",s)
        s=gsub("\\(+)","",s)
        loc=grep(s,enrg$V1)
          if (isEmpty(loc)) {next ;xxx  }
      enrgx=enrg[c(loc:(loc+5)),]
      enrgx=as.data.frame(enrgx[4])
      enrgx=separate(enrgx,col=1,into = c('seq','DG'),window+2)
      enrgx$DG=gsub("]","",enrgx$DG)
      checkout2[e,'DeltaG']=as.numeric(enrgx$DG)
      }
      

      #### get BPP

      fls=list.files(paste0(scratch_UP,'/'), pattern="dp.ps", all.files=FALSE,full.names=FALSE)
      fls=paste0(scratch_UP,'/',fls)
      fls=as.list(fls)

      # xxx=gsub("\\(","\\\\(",fls)
      # xxx=str_replace_all(fls,"\\(","/\\(")
      # str_replace_all(xxx,"/","\\\\")
      

      out=mclapply(fls,RNAfold_output,mc.cores=UseCores)
      out2=out[[1]][,1:2,drop=F]
      out2_max=out[[1]][,1:2,drop=F]
      for (o in 1:length(out)) {
        if (nrow(out[[o]][,3,drop=F])==0) {
          out2$Xx=0;colnames(out2)[colnames(out2)%in%"Xx"]=colnames(out[[o]][,3,drop=F])
          out2_max$Xx=0;colnames(out2_max)[colnames(out2_max)%in%"Xx"]=colnames(out[[o]][,4,drop=F])
        }
        if (nrow(out[[o]][,3,drop=F])>0) {
          out2=cbind(out2,out[[o]][,3,drop=F])
          out2_max=cbind(out2_max,out[[o]][,4,drop=F])
        }
      }

      system(paste0('rm -f ',scratch_UP,'/*.ps'))
      system(paste0('rm -f ',scratch_UP,'/*.fa'))
      system(paste0('rm -f ',scratch_UP,'/*.bed'))
      system(paste0('rm -f ',scratch_UP,'/out.dn.fold'))

##################################################################################
      ph=as.matrix(out2[,-c(1:2)])
      ph=ph[is.na(ph[,3])==F,]
      ph[is.nan(as.matrix(ph))==T]=0

      ph=t(ph)
      ph=as.data.frame(ph)

      #### sort by row diff
      ph=ph[order(rowMaxs(as.matrix(ph)),decreasing = T),]
      colnames(ph)=c(-window:(ncol(ph)-(window+1)))
      ph= cbind(rowMaxs(as.matrix(ph)),ph);colnames(ph)[1]='diff'
      
      
      
      ph$n=rownames(ph)
      ph$n=gsub(paste0(scratch_UP,'/'),"",ph$n)
      ph$n=gsub("_",":",ph$n)
            rownames(ph)=ph$n

      checkout2=merge(checkout2,ph[,c("n","diff"),drop=F],by='n',all.x=T)
      colnames(checkout2)[colnames(checkout2)%in%'diff']='RowMax_MeanChuckBPP'
      
      
      #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
      #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####

      ph_max=as.matrix(out2_max[,-c(1:2)])
      ph_max=ph_max[is.na(ph_max[,3])==F,]
      ph_max[is.nan(as.matrix(ph_max))==T]=0

      ph_max=t(ph_max)
      ph_max=as.data.frame(ph_max)
      
      
      #### sort by row diff
      ph_max=ph_max[order(rowMaxs(as.matrix(ph_max)),decreasing = T),]
      colnames(ph_max)=c(-window:(ncol(ph_max)-(window+1)))
      ph_max= cbind(rowMaxs(as.matrix(ph_max)),ph_max);colnames(ph_max)[1]='diff'

      
      ph_max$n=rownames(ph_max)
      ph_max$n=gsub(paste0(scratch_UP,'/'),"",ph_max$n)
      ph_max$n=gsub("_",":",ph_max$n)
      rownames(ph_max)=ph_max$n
      
      checkout2=merge(checkout2,ph_max[,c("n","diff"),drop=F],by='n',all.x=T)
      colnames(checkout2)[colnames(checkout2)%in%'diff']='RowMax_MaxChuckBPP'
      
      
      
  
      
      ####################################################################
      #### downstream Base Pairing Probability
      ####################################################################
      ### High base pair probability chr8:18628170-18628210
      ### low basepair probibility chr9:72581297-72581336

      system(paste0('rm -f ',scratch_DN,'/*.ps'))
      system(paste0('rm -f ',scratch_DN,'/out.dn.fold'))
      
      system(paste0(RNAfold,' -p -i ',outdir_DN,'/AllLocation_DN_Random.fa > out.dn.fold'),intern=F)
      system(paste0('mv *.ps ',scratch_DN))
      system(paste0('mv out.dn.fold ',scratch_DN),intern = F)
      
      energDN=read.delim(paste0(scratch_DN,'/out.dn.fold'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F)
       
      for (e in 1:nrow(checkout2_dn)) {
        s=as.character(checkout2_dn[e,'n'])
        s=gsub("\\(-)","",s)
        s=gsub("\\(+)","",s)
        loc=grep(s,energDN$V1)
        if (isEmpty(loc)) {next ;xxx  }
        
        energDNx=energDN[c(loc:(loc+5)),]
        energDNx=as.data.frame(energDNx[4])
        energDNx=separate(energDNx,col=1,into = c('seq','DG'),window+2)
        energDNx$DG=gsub("]","",energDNx$DG)
        checkout2_dn[e,'DeltaG']=as.numeric(energDNx$DG)
      }
    
      
      
      flsdn=list.files(paste0(scratch_DN,'/'), pattern="dp.ps", all.files=FALSE, full.names=FALSE)
      flsdn=paste0(scratch_DN,'/',flsdn)
      
      flsdn= as.list(flsdn)

      outdn=mclapply((flsdn),RNAfold_output,mc.cores=UseCores)
      outdn2=outdn[[1]][,1:2,drop=F]
      outdn2_max=outdn[[1]][,1:2,drop=F]
      for (o in 1:length(outdn)) {
        if (nrow(outdn[[o]][,3,drop=F])==0) {
          outdn2$Xx=0;colnames(outdn2)[colnames(outdn2)%in%"Xx"]=colnames(outdn[[o]][,3,drop=F])
          outdn2_max$Xx=0;colnames(outdn2_max)[colnames(outdn2_max)%in%"Xx"]=colnames(outdn[[o]][,4,drop=F])
        }
        if (nrow(outdn[[o]][,3,drop=F])>0) {
          outdn2=cbind(outdn2,outdn[[o]][,3,drop=F])
          outdn2_max=cbind(outdn2_max,outdn[[o]][,4,drop=F])
        }
      }

      system(paste0('rm -f ',scratch_DN,'/*.ps'))
      system(paste0('rm -f ',scratch_DN,'/*.fa'))
      system(paste0('rm -f ',scratch_DN,'/*dn.bed'))
      system(paste0('rm -f ',scratch_DN,'/out.dn.fold'))

      # #########################################
      phdn=as.matrix(outdn2[,-c(1:2)])
      phdn=phdn[is.na(phdn[,3])==F,]
      phdn[is.nan(as.matrix(phdn))==T]=0
      phdn=t(phdn)
      phdn=as.data.frame(phdn)

      #### sort by row diff
      phdn=phdn[order(rowMaxs(as.matrix(phdn)),decreasing = T),]
      colnames(phdn)=c(((window+1)-(ncol(phdn))):window)
      phdn= cbind(rowMaxs(as.matrix(phdn)),phdn);colnames(phdn)[1]='diff'

      
      
      phdn$n=rownames(phdn)
      phdn$n=gsub(paste0(scratch_DN,'/'),"",phdn$n)
      phdn$n=gsub("_",":",phdn$n)
                  rownames(phdn)=phdn$n

      checkout2_dn=merge(checkout2_dn,phdn[,c("n","diff"),drop=F],by='n',all.x=T)
      colnames(checkout2_dn)[colnames(checkout2_dn)%in%'diff']='RowMax_MeanChuckBPP'

      # #########################################
      # #########################################
      phdn_max=as.matrix(outdn2_max[,-c(1:2)])
      phdn_max=phdn_max[is.na(phdn_max[,3])==F,]
      phdn_max[is.nan(as.matrix(phdn_max))==T]=0
      phdn_max=t(phdn_max)
      phdn_max=as.data.frame(phdn_max)

      #### sort by row diff
      phdn_max=phdn_max[order(rowMaxs(as.matrix(phdn_max)),decreasing = T),]
      colnames(phdn_max)=c(((window+1)-(ncol(phdn_max))):window)
      phdn_max= cbind(rowMaxs(as.matrix(phdn_max)),phdn_max);colnames(phdn_max)[1]='diff'
      
     
      phdn_max$n=rownames(phdn_max)
            phdn_max$n=gsub(paste0(scratch_DN,'/'),"",phdn_max$n)
            phdn_max$n=gsub("_",":",phdn_max$n)
                        rownames(phdn_max)=phdn_max$n

            
      checkout2_dn=merge(checkout2_dn,phdn_max[,c("n","diff"),drop=F],by='n',all.x=T)
      colnames(checkout2_dn)[colnames(checkout2_dn)%in%'diff']='RowMax_MaxChuckBPP'
      

  

  rownames(outchunck_rowaverage)=c(1:nrepeat)
  # colnames(outchunck_rowaverage)=c(-window:-1)
  colnames(outchunck_rowaverage)=c(10:window)

Basepairing probability Heatmap

Upstream heatmap was sorted by maximum average Base pairing probability for each Row.

Downstream heat map is sorted by same order as Upstream heatmap.

Heat map is split by Upstream sequences with row max BPP > 0.2.

Heatmaps are annotated with each sequences:
DeltaG – Gibbs free energy for 100nt region
Gquad_score- G-quad score determined by QGRS if G-quad is identified
IntronDist- distance (nt) of CLIP peak from corresponding Upstream/Downstream Intron junction

  checkout2_HM=merge(checkout2,Peaksdata4[,c('ID','Counts_Unique','Counts_Multimappers_All','Counts_Multimappers_Scaled','Counts_Multimappers_BestMapping')],by.y="ID",by.x="CLIP_ID")
  checkout2_dn_HM=merge(checkout2_dn,Peaksdata4[,c('ID','Counts_Unique','Counts_Multimappers_All','Counts_Multimappers_Scaled','Counts_Multimappers_BestMapping')],by.y="ID",by.x="CLIP_ID")
########################################################################################################
  ################################################################################################################################################

   
hm=as.matrix(ph[,colnames(ph)%in%c('diff','n')==F])
      hmp=hm
      Gname=as.matrix(colnames(hm))
     Gname[,1]=NA
     Gname[which(colnames(hm)%in%colnames(hmp)[seq(1,ncol(hm),15)]),1]=colnames(hm)[seq(1,ncol(hm),15)]
     Gname=as.numeric(Gname)
     Gname[which(is.na(Gname))]=""
     
     ########Z
     hm=as.matrix(ph[,colnames(ph)%in%c('n')==F])

      nucmap=hm
      DGAnno_UP=checkout2_HM
      
      rownames(DGAnno_UP)=DGAnno_UP$n
      DGAnno_UP=DGAnno_UP[rownames(nucmap),c('DeltaG','CLIP_ID','qgrs_maxScore','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All'),drop=F]
      # setdiff(c('DeltaG','CLIP_ID','qgrs_maxScore','upstream_exon_dist','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All'),colnames('DGAnno_UP'))

            hm=merge(DGAnno_UP,hm,by="row.names")
            hm=hm[order(hm$diff,decreasing = T),]
            rownames(hm)=hm[,'CLIP_ID']
                  split=as.matrix(hm$diff<meanBPP)    

      DGAnno_UP=hm[,colnames(hm)%in%c('Row.names','DeltaG','CLIP_ID','upstream_exon_dist','qgrs_maxScore','diff','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All')]
      hm=as.matrix(hm[,colnames(hm)%in%c('Row.names','DeltaG','CLIP_ID','qgrs_maxScore','upstream_exon_dist','diff','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All')==F])
            
      DGAnno_UP$DeltaG=-as.numeric(DGAnno_UP$DeltaG)
      # DGAnno_UP=rowAnnotation(DeltaG = row_anno_barplot(as.numeric(DGAnno_UP$DeltaG)))
      Anno_UP=rowAnnotation( Gibbs_energy = row_anno_barplot(as.numeric(DGAnno_UP$DeltaG)),
                             Gquad_score = row_anno_barplot(as.numeric(DGAnno_UP$qgrs_maxScore)),
                             ExonDistance = row_anno_barplot(as.numeric(DGAnno_UP$upstream_exon_dist)),
                             # Counts_UniqueReads = row_anno_barplot(as.numeric(DGAnno_UP$Counts_Unique)),
                             Counts_ScaledMM = row_anno_barplot(as.numeric(DGAnno_UP$Counts_Multimappers_Scaled)),
                             # Counts_AllMM = row_anno_barplot(as.numeric(DGAnno_UP$Counts_Multimappers_All)),
                             annotation_name_rot=270)
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
      col_fun = colorRamp2(c(0, .2, .4), c("blue", "white", "red"))

 BPP_UP=ComplexHeatmap::Heatmap(hm,col = col_fun,name = "matBPP_UP", 
                                  column_title = "Upstream",column_title_side = "top",    heatmap_legend_param = list(title = "BasePairing\nProbability"),
                                  row_title = paste0(CLIPtype," CLIP peaks\nBasepairing Probability"),row_title_side = "left",
                                show_row_names=F,    row_names_side = "left", row_names_gp = gpar(fontsize = 8),
                                  cluster_columns = F,cluster_rows =F,
                         row_split = split,row_gap =unit(3,"mm"),
                              column_labels = Gname,
                          #left_annotation = GscrAnno_UP,
                         right_annotation = Anno_UP, 
                        cluster_row_slices = FALSE,
                          width = unit(w, "in"),heatmap_height = unit(h, "cm"), column_names_gp = gpar(fontsize = 8)
                          )  
  
 op= row_order(BPP_UP)
## Warning: The heatmap has not been initialized. You might have different results
## if you repeatedly execute this function. It is more suggested to do as
## `ht = draw(ht); row_order(ht)`.
opn=rownames(hm)[unlist(op)]
 



hmDN=as.matrix(phdn[,colnames(phdn)%in%c('diff','n')==F])
      hmDNp=hmDN
      Gname=as.matrix(colnames(hmDN))
     Gname[,1]=NA
     Gname[which(colnames(hmDN)%in%colnames(hmDNp)[seq(1,ncol(hmDN),15)]),1]=colnames(hmDN)[seq(1,ncol(hmDN),15)]
     Gname=as.numeric(Gname)
     Gname[which(is.na(Gname))]=""
     
     ########Z
     hmDN=as.matrix(phdn[,colnames(phdn)%in%c('n')==F])
      nucmap=hmDN
      DGAnno_DN=checkout2_dn_HM
      
      rownames(DGAnno_DN)=DGAnno_DN$n
      # rownames(DGAnno_DN)=DGAnno_DN$CLIP_ID
      DGAnno_DN=DGAnno_DN[rownames(nucmap),c('DeltaG','CLIP_ID','qgrs_maxScore','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All'),drop=F]
      
            hmDN=merge(DGAnno_DN,hmDN,by="row.names")
            hmDN=hmDN[order(hmDN$diff,decreasing = T),]
            rownames(hmDN)=hmDN[,'CLIP_ID']
                  split=as.matrix(hmDN$diff<meanBPP)    

      DGAnno_DN=hmDN[,colnames(hmDN)%in%c('Row.names','DeltaG','CLIP_ID','qgrs_maxScore','downstream_exon_dist','diff','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All')]
      hmDN=as.matrix(hmDN[,colnames(hmDN)%in%c('Row.names','DeltaG','CLIP_ID','downstream_exon_dist','qgrs_maxScore','diff','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All')==F])
            
      DGAnno_DN$DeltaG=-as.numeric(DGAnno_DN$DeltaG)
      # DGAnno_DN=rowAnnotation(DeltaG = row_anno_barplot(as.numeric(DGAnno_DN$DeltaG)))
      Anno_DN=rowAnnotation(DeltaG = row_anno_barplot(as.numeric(DGAnno_DN$DeltaG)),
                            Gquad_score = row_anno_barplot(as.numeric(DGAnno_DN$qgrs_maxScore)),
                            ExonDist = row_anno_barplot(as.numeric(DGAnno_DN$downstream_exon_dist)),
                            # Counts_UniqueReads = row_anno_barplot(as.numeric(DGAnno_DN$Counts_Unique)),
                             Counts_ScaledMM = row_anno_barplot(as.numeric(DGAnno_DN$Counts_Multimappers_Scaled)),
                             # Counts_AllMM = row_anno_barplot(as.numeric(DGAnno_DN$Counts_Multimappers_All)),
                            annotation_name_rot=270)
## Warning in min(x): no non-missing arguments to min; returning Inf

## Warning in min(x): no non-missing arguments to max; returning -Inf
     hmDN=hmDN[opn,]
  BPP_DN=ComplexHeatmap::Heatmap(hmDN,col = col_fun,name = "matBPP_DN", 
                                  column_title = "Downstream",column_title_side = "top",show_heatmap_legend = F,
                                  row_title = paste0(CLIPtype," CLIP peaks"),row_title_side = "left",show_row_names=F, row_names_gp = gpar(fontsize = 8),
                                  cluster_columns = F,cluster_rows =F,
                                   right_annotation = Anno_DN, 
                              column_labels = Gname,
                          width = unit(w, "in"),heatmap_height = unit(h, "cm"), column_names_gp = gpar(fontsize = 8)#,
                                     )
  
  ht_list = BPP_UP + BPP_DN
          
          # BPP_UP
          # BPP_DN
           # draw(ht_list, ht_gap = unit(1, "in"),main_heatmap = "matBPP_UP",column_title = "Distance from CLIP peak",column_title_side="bottom")
                
           draw(ht_list, ht_gap = unit(1, "in"),main_heatmap = "matBPP_UP",column_title = "Distance from CLIP peak",column_title_side="bottom",heatmap_height = unit((20), "cm"),heatmap_width = unit(7, "cm"))
## 'heatmap_width' should not be set in draw() for horizontal heatmap list
## (Note a single heatmap is a horizontal heatmap list). Please directly
## set it in `Heatmap()`.

     op= row_order(BPP_UP)
## Warning: The heatmap has not been initialized. You might have different results
## if you repeatedly execute this function. It is more suggested to do as
## `ht = draw(ht); row_order(ht)`.
for (clst in 1:length(op)) {
     
      opn=rownames(hm)[unlist(op[[clst]])]
 
     
     opavg=cbind(colMeans(hm[opn,c(ncol(hm):1)]),colMeans(hmDN[opn,]))
     colnames(opavg)=c("Upstream","Downstream")
     opavg=melt(opavg)
     colnames(opavg)=c("distance","nucleotide","value")
     
     print(ggplot(opavg,aes(y=value,x=distance,line=nucleotide,color=nucleotide))+
                 xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Average basepairing probabilty')+theme_classic()+ggtitle(paste0(clst,' window size=',ch,' - upstream/downstream 100nt'))+
      geom_line(na.rm=F)#+ylim(.0,.6) 
      )
   }

The first heatmap shows all CLIP locations and is split row max BPP > 0.2 (TOP).

The second heatmap shows only the rows with row max BPP > 0.2 and clustered using hierarchical clustering.
The second heatmap is scaled so that it shoud dirrectly be superimposed over the TOP unclusted rows in the first heatmap

robust_dist = function(x, y) {
    qx = quantile(x, c(0.1, 0.9))
    qy = quantile(y, c(0.1, 0.9))
    l = x > qx[1] & x < qx[2] & y > qy[1] & y < qy[2]
    x = x[l]
    y = y[l]
    sqrt(sum((x - y)^2))
}
  
  checkout2_HM=merge(checkout2,Peaksdata4[,c('ID','Counts_Unique','Counts_Multimappers_All','Counts_Multimappers_Scaled','Counts_Multimappers_BestMapping')],by.y="ID",by.x="CLIP_ID")
  checkout2_dn_HM=merge(checkout2_dn,Peaksdata4[,c('ID','Counts_Unique','Counts_Multimappers_All','Counts_Multimappers_Scaled','Counts_Multimappers_BestMapping')],by.y="ID",by.x="CLIP_ID")
########################################################################################################
  ################################################################################################################################################

#### Upstream Heatmap
hm=as.matrix(ph[,colnames(ph)%in%c('diff','n')==F])
      hmp=hm
      Gname=as.matrix(colnames(hm))
     Gname[,1]=NA
     Gname[which(colnames(hm)%in%colnames(hmp)[seq(1,ncol(hm),15)]),1]=colnames(hm)[seq(1,ncol(hm),15)]
     Gname=as.numeric(Gname)
     Gname[which(is.na(Gname))]=""
     
     ########Z
     hm=as.matrix(ph[,colnames(ph)%in%c('n')==F])

      nucmap=hm
      DGAnno_UP=checkout2_HM
      
      rownames(DGAnno_UP)=DGAnno_UP$n
      DGAnno_UP=DGAnno_UP[rownames(nucmap),c('DeltaG','CLIP_ID','qgrs_maxScore','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All'),drop=F]
      # setdiff(c('DeltaG','CLIP_ID','qgrs_maxScore','upstream_exon_dist','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All'),colnames('DGAnno_UP'))
      
            hm=merge(DGAnno_UP,hm,by="row.names")
            hm=hm[order(hm$diff,decreasing = T),]
            hm=hm[hm$diff>meanBPP,]
            rownames(hm)=hm[,'CLIP_ID']
                  split=as.matrix(hm$diff<meanBPP)    

      DGAnno_UP=hm[,colnames(hm)%in%c('Row.names','DeltaG','CLIP_ID','qgrs_maxScore','diff','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All')]
      hm=as.matrix(hm[,colnames(hm)%in%c('Row.names','DeltaG','CLIP_ID','qgrs_maxScore','upstream_exon_dist','diff','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All')==F])
            
      DGAnno_UP$DeltaG=-as.numeric(DGAnno_UP$DeltaG)
      # DGAnno_UP=rowAnnotation(DeltaG = row_anno_barplot(as.numeric(DGAnno_UP$DeltaG)))
      Anno_UP=rowAnnotation( Gibbs_energy = row_anno_barplot(as.numeric(DGAnno_UP$DeltaG)),
                             Gquad_score = row_anno_barplot(as.numeric(DGAnno_UP$qgrs_maxScore)),
                             ExonDistance = row_anno_barplot(as.numeric(DGAnno_UP$upstream_exon_dist)),
                             # Counts_UniqueReads = row_anno_barplot(as.numeric(DGAnno_UP$Counts_Unique)),
                             Counts_ScaledMM = row_anno_barplot(as.numeric(DGAnno_UP$Counts_Multimappers_Scaled)),
                             # Counts_AllMM = row_anno_barplot(as.numeric(DGAnno_UP$Counts_Multimappers_All)),
                             annotation_name_rot=270)
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
### Heatmap

      col_fun = colorRamp2(c(0, .2, .4), c("blue", "white", "red"))
      
{     xx= hclust(dist(cor(hm)),method = 'average',members=NULL)
BPP_UP= ComplexHeatmap::Heatmap(hm,col = col_fun,name = "matBPP_UP", 
                                  column_title = "Upstream",column_title_side = "top",
                                heatmap_legend_param = list(title = "BasePairing\nProbability"),
                                  # row_title = paste0(CLIPtype," CLIP peaks\nBasepairing Probability"),row_title_side = "left",
                                show_row_names=F,    row_names_side = "left", row_names_gp = gpar(fontsize = 8),
                                  cluster_columns = F,
                                # cluster_rows =T,
                                # clustering_distance_rows = "pearson",
                                clustering_distance_rows = function(x, y) 1 - cor(x, y,method = 'pearson'),
                                # clustering_distance_rows = robust_dist,
                                # cluster_rows = xx,
                                 row_split = 4,row_gap =unit(3,"mm"),
                              column_labels = F,
                              show_column_names = F,
                          #left_annotation = GscrAnno_UP,
                         right_annotation = Anno_UP, 
                        # cluster_row_slices = FALSE,
                          width = unit(w, "in"),heatmap_height = unit(h, "cm"), column_names_gp = gpar(fontsize = 8)
                          )  
  
      }    
 op= row_order(BPP_UP)
## Warning: The heatmap has not been initialized. You might have different results
## if you repeatedly execute this function. It is more suggested to do as
## `ht = draw(ht); row_order(ht)`.
opn=rownames(hm)[unlist(op)]



####################################################
#### Downstream Heatmap
############################################################
hmDN=as.matrix(phdn[,colnames(phdn)%in%c('diff','n')==F])
      hmDNp=hmDN
      Gname=as.matrix(colnames(hmDN))
     Gname[,1]=NA
     Gname[which(colnames(hmDN)%in%colnames(hmDNp)[seq(1,ncol(hmDN),15)]),1]=colnames(hmDN)[seq(1,ncol(hmDN),15)]
     Gname=as.numeric(Gname)
     Gname[which(is.na(Gname))]=""
     
     ########
     hmDN=as.matrix(phdn[,colnames(phdn)%in%c('n')==F])
      nucmap=hmDN
      DGAnno_DN=checkout2_dn_HM
      
      rownames(DGAnno_DN)=DGAnno_DN$n
      # rownames(DGAnno_DN)=DGAnno_DN$CLIP_ID
      DGAnno_DN=DGAnno_DN[rownames(nucmap),c('DeltaG','CLIP_ID','qgrs_maxScore','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All'),drop=F]
      
            hmDN=merge(DGAnno_DN,hmDN,by="row.names")
            hmDN=hmDN[order(hmDN$diff,decreasing = T),]
            hmDN=hmDN[hmDN$CLIP_ID%in%rownames(hm),]
            rownames(hmDN)=hmDN[,'CLIP_ID']
                  split=as.matrix(hmDN$diff<.2)

      DGAnno_DN=hmDN[,colnames(hmDN)%in%c('Row.names','DeltaG','CLIP_ID','qgrs_maxScore','diff','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All')]
      hmDN=as.matrix(hmDN[,colnames(hmDN)%in%c('Row.names','DeltaG','CLIP_ID','qgrs_maxScore','diff','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_All')==F])
            
      DGAnno_DN$DeltaG=-as.numeric(DGAnno_DN$DeltaG)
      # DGAnno_DN=rowAnnotation(DeltaG = row_anno_barplot(as.numeric(DGAnno_DN$DeltaG)))
      Anno_DN=rowAnnotation(DeltaG = row_anno_barplot(as.numeric(DGAnno_DN$DeltaG)),
                            Gquad_score = row_anno_barplot(as.numeric(DGAnno_DN$qgrs_maxScore)),
                            ExonDist = row_anno_barplot(as.numeric(DGAnno_DN$downstream_exon_dist)),
                            # Counts_UniqueReads = row_anno_barplot(as.numeric(DGAnno_DN$Counts_Unique)),
                             Counts_ScaledMM = row_anno_barplot(as.numeric(DGAnno_DN$Counts_Multimappers_Scaled)),
                             # Counts_AllMM = row_anno_barplot(as.numeric(DGAnno_DN$Counts_Multimappers_All)),
                            annotation_name_rot=270)
## Warning in min(x): no non-missing arguments to min; returning Inf

## Warning in min(x): no non-missing arguments to max; returning -Inf
### Heatmap
      
     hmDN=hmDN[rownames(hm),]
  BPP_DN=ComplexHeatmap::Heatmap(hmDN,col = col_fun,name = "matBPP_DN", 
                                  column_title = "Downstream",column_title_side = "top",
                                 show_heatmap_legend = F,
                                  # row_title = paste0(CLIPtype," CLIP peaks"),row_title_side = "left",
                                 show_row_names=F, row_names_gp = gpar(fontsize = 8),
                                  cluster_columns = F,cluster_rows =F,
                                   right_annotation = Anno_DN, 
                              column_labels = Gname,
                          width = unit(w, "in"),heatmap_height = unit(h, "cm"), column_names_gp = gpar(fontsize = 8)#,
                                     )

  ht_list = BPP_UP + BPP_DN
          
          # BPP_UP
          # BPP_DN
           draw(ht_list, ht_gap = unit(1, "in"),main_heatmap = "matBPP_UP",column_title = "Distance from CLIP peak",column_title_side="bottom")

           # 
           # draw(ht_list[1:length(split[split[,]==F,])], ht_gap = unit(1, "in"),main_heatmap = "matBPP_UP",column_title = "Distance from CLIP peak",column_title_side="bottom",heatmap_height = unit(30, "cm"),heatmap_width = unit(7, "cm"))
           
           draw(ht_list, ht_gap = unit(1, "in"),main_heatmap = "matBPP_UP",column_title_side="bottom",heatmap_height = unit((20*(nrow(ph[ph$diff>meanBPP,])/nrow(ph))), "cm"),heatmap_width = unit(7, "cm"))
## 'heatmap_width' should not be set in draw() for horizontal heatmap list
## (Note a single heatmap is a horizontal heatmap list). Please directly
## set it in `Heatmap()`.

  # Heatmap(ht_list[1:length(split[split[,]==F,])])
     op= row_order(BPP_UP)
## Warning: The heatmap has not been initialized. You might have different results
## if you repeatedly execute this function. It is more suggested to do as
## `ht = draw(ht); row_order(ht)`.
for (clst in 1:length(op)) {
     
      opn=rownames(hm)[unlist(op[[clst]])]
 
     
     opavg=cbind(colMeans(hm[opn,c(ncol(hm):1)]),colMeans(hmDN[opn,]))
     colnames(opavg)=c("Upstream","Downstream")
     opavg=melt(opavg)
     colnames(opavg)=c("distance","nucleotide","value")
     
     print(ggplot(opavg,aes(y=value,x=distance,line=nucleotide,color=nucleotide))+
                 xlab("Position of Window\n(Distance from 5'Clip peak)")+ylab('Average basepairing probabilty')+theme_classic()+ggtitle(paste0('Cluster ',clst,': window size=',ch,' - upstream/downstream 100nt'))+
      geom_line(na.rm=F)#+ylim(.0,.6) 
      )
   }

Local Structure compared to Random

      BPPfracRando=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Random_outchunck_frac_100chunk4.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
            for (rand in c(2:12)) {

      BPPfracRando2=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Random_outchunck_frac_100chunk4.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))

 BPPfracRando=rbind(BPPfracRando,BPPfracRando2) 
            }
 ###################################

 BPPfracRando=as.data.frame(BPPfracRando[,1]/BPPfracRando[,2])
 colnames(BPPfracRando)='frac_high_BPP'
 
 
 
 ggplot(BPPfracRando,aes(x=frac_high_BPP))+geom_density(size= .6) + theme_classic()+ylab("Density")+xlab("Fraction of Locations with with local BPP > 0.2")+ggtitle('Upstream: Fraction of Locations with High Local BasePairing Probabilities' )+
       geom_vline(xintercept=sum(as.numeric(checkout2$RowMax_MeanChuckBPP)>.2,na.rm = T)/nrow(checkout2),colour="red") 
## Warning: Removed 257 rows containing non-finite values (stat_density).

 ################################### ################################### ###################################
      BPPfracRando_dn=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/Random_dn_outchunck_frac_100chunk4.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
            for (rand in c(2:12)) {

      BPPfracRando2_dn=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/Random_dn_outchunck_frac_100chunk4.txt'),comment.char = "%",header = F,sep = "\t",stringsAsFactors = F))
      
 BPPfracRando_dn=rbind(BPPfracRando_dn,BPPfracRando2_dn)       
            }
########
 
 BPPfracRando_dn=as.data.frame(BPPfracRando_dn[,1]/BPPfracRando_dn[,2])
 colnames(BPPfracRando_dn)='frac_high_BPP'
 
 
 
 ggplot(BPPfracRando_dn,aes(x=frac_high_BPP))+geom_density(size= .6) + theme_classic()+ylab("Density")+xlab("Fraction of Locations with with local BPP > 0.2")+ggtitle('Downstream: Fraction of Locations with High Local BasePairing Probabilities' )+
       geom_vline(xintercept=sum(as.numeric(checkout2_dn$RowMax_MeanChuckBPP)>.2,na.rm = T)/nrow(checkout2_dn),colour="red") 
## Warning: Removed 257 rows containing non-finite values (stat_density).

      BPPavgRando=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Random_outchunck_rowaverage_100chuck4.txt'),comment.char = "%",header = T,row.names=1,sep = "\t",stringsAsFactors = F))
      
      for (rand in c(2:12)) {

      BPPavgRando2=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/upstream/Random_outchunck_rowaverage_100chuck4.txt'),comment.char = "%",header = T,row.names = 1, sep = "\t",stringsAsFactors = F))
      
       BPPavgRando=rbind(as.data.frame(BPPavgRando),
                            as.data.frame(BPPavgRando2))
      
      }
     
      
BPPavgRando=BPPavgRando[is.na(BPPavgRando[,1])==F,]
# BPPavgRando=as.numeric(as.character(BPPavgRando))
  colnames(BPPavgRando)=seq(-window,(ncol(BPPavgRando)-(window+1)),1)
 rownames(BPPavgRando)=paste0('n',rownames(BPPavgRando))
 BPPavgRando= melt(as.matrix(BPPavgRando))
       colnames(BPPavgRando)=c('Random','location','avg_high_BPP')
BPPavgRando$avg_high_BPP=as.numeric(as.character(BPPavgRando$avg_high_BPP))
 
   
 CLIP=(ph[ph$diff>.2,])
 CLIP=(colMeans(CLIP[,colnames(CLIP)%in%c('diff','n')==F]))
 CLIP=as.data.frame(rbind(CLIP,colnames(CLIP)))
 CLIP=t(CLIP)
 CLIP=as.data.frame(cbind(CLIP,rownames(CLIP)))
   colnames(CLIP)=c('avg_high_BPP','location')
CLIP$avg_high_BPP=as.numeric(as.matrix(CLIP$avg_high_BPP))
 CLIP$location=as.numeric(as.matrix(CLIP$location))
CLIP$Random='upstream'


ggplot(BPPavgRando,aes(y=avg_high_BPP,x=location,line=Random))+
   geom_line(data=BPPavgRando,size= .6,color='grey') +
  geom_line(data=CLIP,size= .6,color='red')+
  theme_classic()+ylab("Average Basepairing Probability")+xlab("Position of 10nt Window")+ggtitle('Upstream: Average BPP of Locations with local BPP >0.2')+ylim(0,.3)

########################################################################################################
########################################################################################################
########################################################################################################

      BPPavgRando_dn=as.matrix(read.delim(paste0(inRandom,'/Random1/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/Random_dn_outchunck_rowaverage_100chucnk4.txt'),comment.char = "%",header = T,row.names=1,sep = "\t",stringsAsFactors = F))

for (rand in c(2:12)) {
  
      BPPavgRando_dn2=as.matrix(read.delim(paste0(inRandom,'/Random',rand,'/',CLIPtype,'/',TrasncLoc,'/',RunMode,'/downstream/Random_dn_outchunck_rowaverage_100chucnk4.txt'),comment.char = "%",header = T,row.names = 1, sep = "\t",stringsAsFactors = F))
      
      BPPavgRando_dn=rbind(as.data.frame(BPPavgRando_dn),
                            as.data.frame(BPPavgRando_dn2))
      
      }
       

BPPavgRando_dn=BPPavgRando_dn[is.na(BPPavgRando_dn[,1])==F,]
# BPPavgRando_dn=as.numeric(as.character(BPPavgRando_dn))
  colnames(BPPavgRando_dn)=seq(1,ncol(BPPavgRando_dn),1)
 rownames(BPPavgRando_dn)=paste0('n',rownames(BPPavgRando_dn))
 BPPavgRando_dn= melt(as.matrix(BPPavgRando_dn))
       colnames(BPPavgRando_dn)=c('Random','location','avg_high_BPP')
BPPavgRando_dn$avg_high_BPP=as.numeric(as.character(BPPavgRando_dn$avg_high_BPP))
 
 CLIP=(phdn[phdn$diff>.2,])
 CLIP=CLIP[,colnames(CLIP)%in%c('diff','n')==F]
  colnames(CLIP)=seq(1,length(CLIP),1)
 CLIP=(colMeans(CLIP[,colnames(CLIP)%in%c('diff','n')==F]))
 CLIP=as.data.frame(rbind(CLIP,colnames(CLIP)))
 CLIP=t(CLIP)
 CLIP=as.data.frame(cbind(CLIP,rownames(CLIP)))
   colnames(CLIP)=c('avg_high_BPP','location')
CLIP$avg_high_BPP=as.numeric(as.matrix(CLIP$avg_high_BPP))
 CLIP$location=as.numeric(as.matrix(CLIP$location))
CLIP$Random='downstream'


# ggplot(BPPavgRando_dn1,aes(y=avg_high_BPP,x=location,line=Random))+
#    geom_line(data=BPPavgRando_dn1,size= .6,color='grey') +
#      geom_line(data=BPPavgRando_dn2,size= .6,color='grey') +
#    geom_line(data=BPPavgRando_dn3,size= .6,color='grey') +
#    geom_line(data=BPPavgRando_dn4,size= .6,color='grey') +
#   geom_line(data=CLIP,size= .6,color='red')+
#   theme_classic()+ylab("Average Basepairing Probability")+xlab("Position of 10nt Window")+ggtitle('Downsteam: Average BPP of Locations with local BPP >0.2')+ylim(0,.3)


ggplot(BPPavgRando_dn,aes(y=avg_high_BPP,x=location,line=Random))+
   geom_line(data=BPPavgRando_dn,size= .6,color='grey') +
  geom_line(data=CLIP,size= .6,color='red')+
  theme_classic()+ylab("Average Basepairing Probability")+xlab("Position of 10nt Window")+ggtitle('Downsteam: Average BPP of Locations with local BPP >0.2')+ylim(0,.3)

Structure/Count Correlations

Upstream

##########################
## Upstream
##########################

   sctrplot=checkout2_HM
  sctrplot[is.na(sctrplot$qgrs_maxScore),'qgrs_maxScore']=0
  sctrplot$qgrs_maxScore=as.numeric(sctrplot$qgrs_maxScore)  
  sctrplot$DeltaG=-sctrplot$DeltaG
  ggscatter(sctrplot, x = 'qgrs_maxScore', y = 'DeltaG', add = "reg.line",color = "red",   
          add.params = list(color = "black", fill = NA), # Customize reg. line
          xlab=expression("GQuad score (qgrs)" ), 
           ylab='CLIP peak Upstream (100nt) \n-DeltaG'
) +
 stat_cor(
   aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),
  label.x = 30,label.y=.65
)
## `geom_smooth()` using formula 'y ~ x'

  ggscatter(sctrplot, x = 'qgrs_maxScore', y = 'RowMax_MeanChuckBPP', add = "reg.line",color = "red",   
          add.params = list(color = "black", fill = NA), # Customize reg. line
          xlab=expression("GQuad score (qgrs)" ), 
           ylab='CLIP peak maximum 10nt window Base pairing probability'
) +
 stat_cor(
   aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),
  label.x = 30,label.y=.65
)
## `geom_smooth()` using formula 'y ~ x'

  #########  #########  #########  #########  #########  #########  #########  #########  #########  #########

 ggscatter(sctrplot, x = 'qgrs_maxScore', y = 'Counts_Multimappers_Scaled', add = "reg.line",color = "red",   
          add.params = list(color = "black", fill = NA), # Customize reg. line
          xlab=expression("CLIP peak Upstream (100nt) \nGQuad score (qgrs)" ), 
           ylab='CLIP peak Read Count (scaled MultiMappers)'
) +
 stat_cor(
   aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~"))#,
  # label.x = 30,label.y=.65
)  
## `geom_smooth()` using formula 'y ~ x'

   ggscatter(sctrplot, x = 'DeltaG', y = 'Counts_Multimappers_Scaled', add = "reg.line",color = "red",   
          add.params = list(color = "black", fill = NA), # Customize reg. line
          xlab=expression("CLIP peak Upstream (100nt) \n-DeltaG" ), 
           ylab='CLIP peak Read Count (scaled MultiMappers)'
) +
 stat_cor(
   aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~"))#,
  # label.x = 30,label.y=.65
)
## `geom_smooth()` using formula 'y ~ x'

 ggscatter(sctrplot, x = 'RowMax_MeanChuckBPP', y = 'Counts_Multimappers_Scaled', add = "reg.line",color = "red",   
          add.params = list(color = "black", fill = NA), # Customize reg. line
          xlab=expression("CLIP peak Upstream (100nt) \nmaximum 10nt window Base pairing probability" ), 
           ylab='CLIP peak Read Count (scaled MultiMappers)'
) +
 stat_cor(
   aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~"))#,
  # label.x = 30,label.y=.65
)
## `geom_smooth()` using formula 'y ~ x'

Downstream

##########################
## Downstream
##########################
  
  sctrplot_dn=checkout2_dn_HM
  sctrplot_dn[is.na(sctrplot_dn$qgrs_maxScore),'qgrs_maxScore']=0
  sctrplot_dn$qgrs_maxScore=as.numeric(sctrplot_dn$qgrs_maxScore)  
  sctrplot_dn$DeltaG=-sctrplot_dn$DeltaG
  ggscatter(sctrplot_dn, x = 'qgrs_maxScore', y = 'DeltaG', add = "reg.line",color = "red",   
          add.params = list(color = "black", fill = NA), # Customize reg. line
          xlab=expression("GQuad score (qgrs)" ), 
           ylab='CLIP peak downstream (100nt) \n-DeltaG'
) +
 stat_cor(
   aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),
  label.x = 10,label.y=.65
)
## `geom_smooth()` using formula 'y ~ x'

  ggscatter(sctrplot_dn, x = 'qgrs_maxScore', y = 'RowMax_MeanChuckBPP', add = "reg.line",color = "red",   
          add.params = list(color = "black", fill = NA), # Customize reg. line
          xlab=expression("GQuad score (qgrs)" ), 
           ylab='CLIP peak maximum 10nt window Base pairing probability'
) +
 stat_cor(
   aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~")),
  label.x = 30,label.y=.65
)
## `geom_smooth()` using formula 'y ~ x'

  #########  #########  #########  #########  #########  #########  #########  #########  #########  #########

 ggscatter(sctrplot_dn, x = 'qgrs_maxScore', y = 'Counts_Multimappers_Scaled', add = "reg.line",color = "red",   
          add.params = list(color = "black", fill = NA), # Customize reg. line
          xlab=expression("CLIP peak downstream (100nt) \nGQuad score (qgrs)" ), 
           ylab='CLIP peak Read Count (scaled MultiMappers)'
) +
 stat_cor(
   aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~"))#,
  # label.x = 30,label.y=.65
)  
## `geom_smooth()` using formula 'y ~ x'

   ggscatter(sctrplot_dn, x = 'DeltaG', y = 'Counts_Multimappers_Scaled', add = "reg.line",color = "red",   
          add.params = list(color = "black", fill = NA), # Customize reg. line
          xlab=expression("CLIP peak downstream (100nt) \n-DeltaG" ), 
           ylab='CLIP peak Read Count (scaled MultiMappers)'
) +
 stat_cor(
   aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~"))#,
  # label.x = 30,label.y=.65
)
## `geom_smooth()` using formula 'y ~ x'

 ggscatter(sctrplot_dn, x = 'RowMax_MeanChuckBPP', y = 'Counts_Multimappers_Scaled', add = "reg.line",color = "red",   
          add.params = list(color = "black", fill = NA), # Customize reg. line
          xlab=expression("CLIP peak downstream (100nt) \nmaximum 10nt window Base pairing probability" ), 
           ylab='CLIP peak Read Count (scaled MultiMappers)'
) +
 stat_cor(
   aes(label = paste(..rr.label.., ..p.label.., sep = "~`,`~"))#,
  # label.x = 30,label.y=.65
)
## `geom_smooth()` using formula 'y ~ x'

  # length(Gname)
  # ncol(hmDN)

Structure associated with LINE/SINE CLIP peaks

Select CLIP peaks that overlap with Introns on the same strand as CLIP peak. Identify CLIP peaks that overlap with LINE/SINE elements on either strand in 100nt window Upstream and Downstream.
Compare deltaG, local base paring probability and Gquad identification/score for Intronic regions.

Upstream Structure Associated With Intronic CLIP peaks that overlap with LINE/SINES

On either strand

################################################################################        
#         Does not exclude peaks thatoverlap with simplerepeats/Low_complexity on opposite strand,
#         but if CLIP-intron does overlap with simplerepeats/Low_complexity on same strand it is removed
        
## will include peaks that do not OL with intron on Same strand but does on opposite strand
LS1=Peaksdata4_chunkInExall[Peaksdata4_chunkInExall$`Both Strand: RNA_type_Classification`%in%c('protein_coding: exon')==F,] 
### Includes any peak that only overlaps with introns on same strand
LS2= Peaksdata4_chunkInExall[grepl('intron',Peaksdata4_chunkInExall$`Same Strand: Intron Exon`),]
### takes away CLIP peaks classified as opposite strand features and still could be interesting
LS3= Peaksdata4_chunkInExall[Peaksdata4_chunkInExall$`Both Strand: RNA_type_Classification`%in%c('protein_coding: exon','Antisense Feature')==F,]

################################################################################        

LISIplot=merge(checkout2[checkout2$CLIP_ID%in%LS3$ID,],isoout[,c('ID','LISI_OLup','strand_LISIup','LISI_OLdn','strand_LISIdn')],by.x='CLIP_ID',by.y='ID')
LISIplot$LISI_OLup=gsub(1,"OL with LISI",LISIplot$LISI_OLup)
LISIplot[is.na(LISIplot$LISI_OLup),'LISI_OLup']="No overlap"
LISIplot$qgrs_maxScore=as.numeric(LISIplot$qgrs_maxScore)
LISIplot[is.na(LISIplot$qgrs_maxScore),'qgrs_maxScore']=0
LISIplot=merge(LISIplot,Peaksdata4[,c('ID','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_BestMapping')],by.x='CLIP_ID',by.y='ID')



ggplot(LISIplot,aes(x=RowMax_MeanChuckBPP,line=LISI_OLup,color=LISI_OLup))+
   geom_density(size= .6) +
  theme_classic()+xlab("Maximum Window Basepairing Probability")+ylab("Density")+ggtitle('Upstream: Maximum BPP of 10nt Chuck')

ggplot(LISIplot,aes(x=-DeltaG,line=LISI_OLup,color=LISI_OLup))+
   geom_density(size= .6) +
  theme_classic()+xlab("-Delta G")+ylab("Density")+ggtitle('Upstream: Folding Energy of 100nt ')

ggplot(LISIplot,aes(x=qgrs_maxScore,line=LISI_OLup,color=LISI_OLup))+
   geom_density(size= .6) +
  theme_classic()+xlab("-Delta G")+ylab("Density")+ggtitle('Upstream: Gquad Score ')

ggplot(LISIplot,aes(x=Counts_Unique,line=LISI_OLup,color=LISI_OLup))+
   geom_density(size= .6) +
  theme_classic()+xlab("CLIP counts:Unique")+ylab("Density")+ggtitle('Upstream: CLIP counts:Unique ')

ggplot(LISIplot,aes(x=Counts_Multimappers_Scaled,line=LISI_OLup,color=LISI_OLup))+
   geom_density(size= .6) +
  theme_classic()+xlab("CLIP counts:Unique")+ylab("Density")+ggtitle('Upstream: CLIP counts:Multimappers_Scaled ')

On same strand

p=LISIplot[-which(LISIplot$strand==LISIplot$strand_LISIup),]

ggplot(p,aes(x=RowMax_MeanChuckBPP,line=LISI_OLup,color=LISI_OLup))+
   geom_density(size= .6) +
  theme_classic()+xlab("Maximum Window Basepairing Probability")+ylab("Density")+ggtitle('Upstream SameStrand: Maximum BPP of 10nt Chuck')

ggplot(p,aes(x=-DeltaG,line=LISI_OLup,color=LISI_OLup))+
   geom_density(size= .6) +
  theme_classic()+xlab("-Delta G")+ylab("Density")+ggtitle('Upstream SameStrand: Folding Energy of 100nt ')

ggplot(p,aes(x=qgrs_maxScore,line=LISI_OLup,color=LISI_OLup))+
   geom_density(size= .6) +
  theme_classic()+xlab("-Delta G")+ylab("Density")+ggtitle('Upstream SameStrand: Gquad Score ')

ggplot(p,aes(x=Counts_Unique,line=LISI_OLup,color=LISI_OLup))+
   geom_density(size= .6) +
  theme_classic()+xlab("CLIP counts:Unique")+ylab("Density")+ggtitle('Upstream SameStrand: CLIP counts:Unique ')

ggplot(p,aes(x=Counts_Multimappers_Scaled,line=LISI_OLup,color=LISI_OLup))+
   geom_density(size= .6) +
  theme_classic()+xlab("CLIP counts:Unique")+ylab("Density")+ggtitle('Upstream SameStrand: CLIP counts:Multimappers_Scaled ')

Downstream Structure Associated With Intronic CLIP peaks that overlap with LINE/SINES

On either strand

LISIplot_dn=merge(checkout2_dn[checkout2_dn$CLIP_ID%in%LS3$ID,],isoout[,c('ID','LISI_OLup','strand_LISIup','LISI_OLdn','strand_LISIdn')],by.x='CLIP_ID',by.y='ID')
LISIplot_dn$LISI_OLdn=gsub(1,"OL with LISI",LISIplot_dn$LISI_OLdn)
LISIplot_dn[is.na(LISIplot_dn$LISI_OLdn),'LISI_OLdn']="No overlap"
LISIplot_dn$qgrs_maxScore=as.numeric(LISIplot_dn$qgrs_maxScore)
LISIplot_dn[is.na(LISIplot_dn$qgrs_maxScore),'qgrs_maxScore']=0
LISIplot_dn=merge(LISIplot_dn,Peaksdata4[,c('ID','Counts_Unique','Counts_Multimappers_Scaled','Counts_Multimappers_BestMapping')],by.x='CLIP_ID',by.y='ID')


ggplot(LISIplot_dn,aes(x=RowMax_MeanChuckBPP,line=LISI_OLdn,color=LISI_OLdn))+
   geom_density(size= .6) +
  theme_classic()+xlab("Maximum Window Basepairing Probability")+ylab("Density")+ggtitle('Downstream: Maximum BPP of 10nt Chuck')

ggplot(LISIplot_dn,aes(x=-DeltaG,line=LISI_OLdn,color=LISI_OLdn))+
   geom_density(size= .6) +
  theme_classic()+xlab("-Delta G")+ylab("Density")+ggtitle('Downstream: Folding Energy of 100nt ')

ggplot(LISIplot_dn,aes(x=qgrs_maxScore,line=LISI_OLdn,color=LISI_OLdn))+
   geom_density(size= .6) +
  theme_classic()+xlab("-Delta G")+ylab("Density")+ggtitle('Downstream: Gquad Score ')

ggplot(LISIplot_dn,aes(x=Counts_Unique,line=LISI_OLdn,color=LISI_OLdn))+
   geom_density(size= .6) +
  theme_classic()+xlab("CLIP counts:Unique")+ylab("Density")+ggtitle('Downstream: CLIP counts:Unique ')

ggplot(LISIplot_dn,aes(x=Counts_Multimappers_Scaled,line=LISI_OLdn,color=LISI_OLdn))+
   geom_density(size= .6) +
  theme_classic()+xlab("CLIP counts:Unique")+ylab("Density")+ggtitle('Downstream: CLIP counts:Multimappers_Scaled ')

On same strand

pdn=LISIplot_dn[-which(LISIplot_dn$strand==LISIplot_dn$strand_LISIdn),]

ggplot(pdn,aes(x=RowMax_MeanChuckBPP,line=LISI_OLdn,color=LISI_OLdn))+
   geom_density(size= .6) +
  theme_classic()+xlab("Maximum Window Basepairing Probability")+ylab("Density")+ggtitle('Downstream SameStrand: Maximum BPP of 10nt Chuck')

ggplot(pdn,aes(x=-DeltaG,line=LISI_OLdn,color=LISI_OLdn))+
   geom_density(size= .6) +
  theme_classic()+xlab("-Delta G")+ylab("Density")+ggtitle('Downstream SameStrand: Folding Energy of 100nt ')

ggplot(pdn,aes(x=qgrs_maxScore,line=LISI_OLdn,color=LISI_OLdn))+
   geom_density(size= .6) +
  theme_classic()+xlab("-Delta G")+ylab("Density")+ggtitle('Downstream SameStrand: Gquad Score ')

ggplot(pdn,aes(x=Counts_Unique,line=LISI_OLdn,color=LISI_OLdn))+
   geom_density(size= .6) +
  theme_classic()+xlab("CLIP counts:Unique")+ylab("Density")+ggtitle('Downstream SameStrand: CLIP counts:Unique ')

ggplot(pdn,aes(x=Counts_Multimappers_Scaled,line=LISI_OLdn,color=LISI_OLdn))+
   geom_density(size= .6) +
  theme_classic()+xlab("CLIP counts:Unique")+ylab("Density")+ggtitle('Downstream SameStrand: CLIP counts:Multimappers_Scaled ')

DISTANCE TO SINE/LINE

CLIP peak 100nt Upstream/Downstream regions were selected from full transcript with introns

Any CLIP peaks that overlap with intronic regions were included in in Intronic CLIP peak analysis. If Peak also overlaps with a Repeat Element it is removed, except for peaks that overlap with LINE, SINE or LTR.

All LINE/SINE in same transcript as CLIP peak , or on the opposite strands, were considered for closest LINE/SINE element

Random peaks were taken from Intronic regions only

INTRONS

NEW

Include same only Strand LISI

Include Opposite only Strand LISI

Include Opposite and same Strand

create mmRNAome

UTR

old

SINE/LINE Transcriptomic Distance

LISI locations

Distibutioin of CLIP Transcript location

Random CLIP locations

Create output file